# Load the 2019 NYC AirBnB listings and preview the first rows.
ds <- read.csv("AB_NYC_2019.csv")
head(ds)
## id name host_id host_name
## 1 2539 Clean & quiet apt home by the park 2787 John
## 2 2595 Skylit Midtown Castle 2845 Jennifer
## 3 3647 THE VILLAGE OF HARLEM....NEW YORK ! 4632 Elisabeth
## 4 3831 Cozy Entire Floor of Brownstone 4869 LisaRoxanne
## 5 5022 Entire Apt: Spacious Studio/Loft by central park 7192 Laura
## 6 5099 Large Cozy 1 BR Apartment In Midtown East 7322 Chris
## neighbourhood_group neighbourhood latitude longitude room_type price
## 1 Brooklyn Kensington 40.64749 -73.97237 Private room 149
## 2 Manhattan Midtown 40.75362 -73.98377 Entire home/apt 225
## 3 Manhattan Harlem 40.80902 -73.94190 Private room 150
## 4 Brooklyn Clinton Hill 40.68514 -73.95976 Entire home/apt 89
## 5 Manhattan East Harlem 40.79851 -73.94399 Entire home/apt 80
## 6 Manhattan Murray Hill 40.74767 -73.97500 Entire home/apt 200
## minimum_nights number_of_reviews last_review reviews_per_month
## 1 1 9 2018-10-19 0.21
## 2 1 45 2019-05-21 0.38
## 3 3 0 NA
## 4 1 270 2019-07-05 4.64
## 5 10 9 2018-11-19 0.10
## 6 3 74 2019-06-22 0.59
## calculated_host_listings_count availability_365
## 1 6 365
## 2 2 355
## 3 1 365
## 4 1 194
## 5 1 0
## 6 1 129
# Count NA values per column.
# colSums(is.na(.)) is the idiomatic column-wise NA count; unlike
# apply(ds, 2, ...) it does not first coerce the data frame to a
# (character) matrix.
colSums(is.na(ds))
## id name
## 0 0
## host_id host_name
## 0 0
## neighbourhood_group neighbourhood
## 0 0
## latitude longitude
## 0 0
## room_type price
## 0 0
## minimum_nights number_of_reviews
## 0 0
## last_review reviews_per_month
## 0 10052
## calculated_host_listings_count availability_365
## 0 0
# NOTES
# Remove NA, empty
#
#
#
#
# Min-max normalize a numeric vector onto [0, 1].
#
# @param x Numeric vector.
# @param na.rm Remove NAs when computing the range? Defaults to FALSE to
#   preserve the original behavior (any NA makes the whole result NA).
# @return Numeric vector the same length as `x`, with min(x) -> 0 and
#   max(x) -> 1. A constant vector yields NaN (zero range).
normalize <- function(x, na.rm = FALSE) {
  rng <- range(x, na.rm = na.rm)
  (x - rng[1]) / (rng[2] - rng[1])
}
# Prepare the raw AirBnB data for modelling:
#  - keep only neighbourhood_group, latitude, longitude, room_type, price
#  - z-score scale the numeric columns
#  - encode the categorical columns as integer-labelled factors
# NOTE(review): relies on dplyr::select() being attached elsewhere in the
# file (dplyr shows up in the package-masking messages below).
clean_data <- function(ds) {
  ds <- select(ds, -c(host_id, id, host_name, name, minimum_nights,
                      number_of_reviews, neighbourhood, last_review,
                      availability_365, reviews_per_month,
                      calculated_host_listings_count))
  numerical <- c("price", "longitude", "latitude")
  ds[numerical] <- scale(ds[numerical])
  # 'levels' spelled out in full: the original relied on partial argument
  # matching ('level='), which is fragile and flagged by lintr.
  ds$neighbourhood_group <- factor(ds$neighbourhood_group,
                                   levels = c("Brooklyn", "Manhattan",
                                              "Queens", "Staten Island",
                                              "Bronx"),
                                   labels = c(1, 2, 3, 4, 5))
  ds$room_type <- factor(ds$room_type,
                         levels = c("Private room", "Entire home/apt",
                                    "Shared room"),
                         labels = c(1, 2, 3))
  ds
}
#ggdraw() +
# draw_image("New_York_City_.png") +
# draw_plot(myplot)
# Run the cleaning pipeline and inspect the first rows of the result.
dataset <- clean_data(ds)
head(dataset)
## neighbourhood_group latitude longitude room_type price
## 1 1 -1.4938339 -0.4376476 1 -0.01549291
## 2 2 0.4524314 -0.6846321 2 0.30097047
## 3 2 1.4683845 0.2224944 1 -0.01132892
## 4 1 -0.8033893 -0.1644481 2 -0.26533242
## 5 2 1.2756468 0.1772139 2 -0.30280835
## 6 2 0.3433173 -0.4946274 2 0.19687067
library(caTools)
## Warning: package 'caTools' was built under R version 3.6.3
library(caret)
## Warning: package 'caret' was built under R version 3.6.3
## Loading required package: lattice
data_clean <- dataset
# BUG FIX: caTools::sample.split() expects the vector of outcome labels,
# not the whole data frame. Passing the data frame builds a column-length
# (5-element) TRUE/FALSE pattern that gets recycled across rows, which is
# why the original sizes were 29337/19558 (a 3:2 split) instead of 75/25.
sample <- sample.split(data_clean$neighbourhood_group, SplitRatio = 0.75)
train <- subset(data_clean, sample)
test <- subset(data_clean, !sample)
# Report row/column counts for the full, training, and test sets.
print("Initial data shape")
## [1] "Initial data shape"
print(dim(data_clean))
## [1] 48895 5
print("Train shape")
## [1] "Train shape"
print(dim(train))
## [1] 29337 5
print("Test shape")
## [1] "Test shape"
print(dim(test))
## [1] 19558 5
# References (commented out — bare URLs are syntax errors in an R script):
# https://www.guru99.com/r-random-forest-tutorial.html
# Good video for parameter optimisation: https://www.youtube.com/watch?v=6EXPYzbfLCE
# Tuning: https://uc-r.github.io/random_forests
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.6.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
# Baseline random-forest classifier predicting the borough
# (neighbourhood_group) from latitude, longitude, room_type and price.
# Defaults apply: ntree = 500, mtry = floor(sqrt(p)).
# (Trailing empty argument after data=train removed — it relied on R
# tolerating an empty element in '...'.)
rf <- randomForest(
  neighbourhood_group ~ .,
  data = train
  # importance = TRUE  # enable to populate variable-importance measures
)
rf
##
## Call:
## randomForest(formula = neighbourhood_group ~ ., data = train, )
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 0.13%
## Confusion matrix:
## 1 2 3 4 5 class.error
## 1 12083 1 12 0 0 0.0010747354
## 2 0 12947 0 0 2 0.0001544521
## 3 16 2 3387 0 0 0.0052863436
## 4 0 0 0 239 0 0.0000000000
## 5 0 4 0 0 644 0.0061728395
# Evaluate the baseline forest on the held-out test set.
#https://www.r-bloggers.com/how-to-implement-random-forests-in-r/
# Variable-importance plot for the fitted forest.
varImpPlot(rf)
# Predicted borough labels for the test rows.
pred = predict(rf, test)
head(pred)
## 2 4 7 9 12 14
## 2 1 1 2 2 2
## Levels: 1 2 3 4 5
plot(pred)
# Raw confusion counts: predicted (rows) vs actual (columns).
table(pred,test$neighbourhood_group)
##
## pred 1 2 3 4 5
## 1 8005 2 7 0 0
## 2 1 8710 0 0 4
## 3 2 0 2253 0 0
## 4 0 0 0 134 0
## 5 0 0 1 0 439
# caret::confusionMatrix adds accuracy, kappa and per-class statistics.
confusionMatrix(pred, test$neighbourhood_group)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4 5
## 1 8005 2 7 0 0
## 2 1 8710 0 0 4
## 3 2 0 2253 0 0
## 4 0 0 0 134 0
## 5 0 0 1 0 439
##
## Overall Statistics
##
## Accuracy : 0.9991
## 95% CI : (0.9986, 0.9995)
## No Information Rate : 0.4454
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9986
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.9996 0.9998 0.9965 1.000000 0.99097
## Specificity 0.9992 0.9995 0.9999 1.000000 0.99995
## Pos Pred Value 0.9989 0.9994 0.9991 1.000000 0.99773
## Neg Pred Value 0.9997 0.9998 0.9995 1.000000 0.99979
## Prevalence 0.4094 0.4454 0.1156 0.006851 0.02265
## Detection Rate 0.4093 0.4453 0.1152 0.006851 0.02245
## Detection Prevalence 0.4098 0.4456 0.1153 0.006851 0.02250
## Balanced Accuracy 0.9994 0.9997 0.9982 1.000000 0.99546
# Random-forest regression of (scaled) price on the remaining features.
# The low % variance explained (~9.5%) shows these four predictors carry
# little pricing signal.
# (Trailing empty argument after data=train removed, as in the fit above.)
rf2 <- randomForest(
  price ~ .,
  data = train
  # importance = TRUE  # enable to populate variable-importance measures
)
rf2
##
## Call:
## randomForest(formula = price ~ ., data = train, )
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 0.8848527
## % Var explained: 9.53
# Diagnostics for the regression forest: importance plot and a
# real-vs-predicted scatter with the y = x reference line.
varImpPlot(rf2)
par(mfrow=c(1,2))
plot(test$price,predict(rf2,test),col='red',main='Real vs predicted RF',pch=18,cex=0.7)
abline(0,1,lwd=2)
# NOTE(review): legend label 'NN' looks copied from a neural-net example;
# this plot shows random-forest predictions — confirm the intended label.
legend('bottomright',legend='NN',pch=18,col='red', bty='n')
# randomForest is already attached above; this second library() call is a
# harmless no-op.
library(randomForest)
library(e1071)
## Warning: package 'e1071' was built under R version 3.6.3
# Resampling scheme shared by all caret::train calls below:
# 10-fold cross-validation with grid search, parallel backends allowed.
trControl <- trainControl(method = "cv",
number = 10,
search = "grid",
allowParallel = TRUE
)
trControl$method
## [1] "cv"
trControl$number
## [1] 10
# Parallel-backend setup notes (not run — kept as comments; the original
# lines jammed several statements onto one line and were syntax errors):
# registerDoFuture(); plan(multiprocess, workers = availableCores() - 1)
# library(doParallel); cl <- makeCluster(detectCores()); registerDoParallel(cl)
# ... machine-learning code goes in here ...; stopCluster(cl)
# Timing template:
# start.time <- Sys.time(); ...relevant code...; end.time <- Sys.time()
# time.taken <- end.time - start.time; time.taken
library(doParallel)
## Warning: package 'doParallel' was built under R version 3.6.3
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 3.6.3
## Loading required package: iterators
## Warning: package 'iterators' was built under R version 3.6.3
## Loading required package: parallel
library(doFuture)
## Warning: package 'doFuture' was built under R version 3.6.3
## Loading required package: globals
## Loading required package: future
##
## Attaching package: 'future'
## The following object is masked from 'package:caret':
##
## cluster
# Cross-validated caret fit of the borough classifier, timed with
# Sys.time() bracketing.
start.time <- Sys.time()
#ptm <- proc.time()
# NOTE(review): num.threads is not a randomForest argument — it is ranger's;
# here it is swallowed by '...' and appears to have no effect for
# method = "rf" (the two timed runs below differ by well under 10%). Verify
# before relying on it for speed-up.
rf_default <- caret::train(
neighbourhood_group ~ . ,
data=train,
method = "rf",
metric="Accuracy",
trControl = trControl,
num.threads = availableCores() # <- This one
)
end.time <- Sys.time()
time.taken <- end.time - start.time
print("-- Time: -- ")
## [1] "-- Time: -- "
time.taken
## Time difference of 5.188274 mins
print("")
## [1] ""
print("-- RANDOM FOREST -- ")
## [1] "-- RANDOM FOREST -- "
#proc.time() - ptm
print(rf_default)
## Random Forest
##
## 29337 samples
## 4 predictor
## 5 classes: '1', '2', '3', '4', '5'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 26402, 26402, 26405, 26405, 26403, 26403, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9977504 0.9963760
## 3 0.9986025 0.9977494
## 5 0.9983979 0.9974200
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.
#ptm <- proc.time()
start.time <- Sys.time()
# Same cross-validated fit without the extra thread argument, to compare
# wall-clock time against the run above (they come out nearly identical).
# (Trailing empty argument after trControl removed.)
rf_default <- caret::train(
  neighbourhood_group ~ .,
  data = train,
  method = "rf",
  metric = "Accuracy",
  trControl = trControl
)
#proc.time() - ptm
end.time <- Sys.time()
time.taken <- end.time - start.time
print("-- Time: -- ")
## [1] "-- Time: -- "
time.taken
## Time difference of 5.623895 mins
print("")
## [1] ""
print(rf_default)
## Random Forest
##
## 29337 samples
## 4 predictor
## 5 classes: '1', '2', '3', '4', '5'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 26402, 26403, 26403, 26404, 26402, 26403, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9979207 0.9966505
## 3 0.9988070 0.9980790
## 5 0.9986706 0.9978591
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.
# Grid-search mtry over 1..10 with 10-fold CV, fixed nodesize/ntree.
start.time <- Sys.time()
set.seed(1234)
# NOTE(review): the model has only ~5 usable predictors after dummy coding,
# so the larger mtry values trigger randomForest's "invalid mtry: reset to
# within valid range" warning on every resample (see the warning flood
# below) and merely repeat the maximum valid setting. Consider capping the
# grid at the predictor count.
tuneGrid <- expand.grid(.mtry = c(1: 10))
rf_mtry <- caret::train(
neighbourhood_group~.,
data = train,
method = "rf",
metric = "Accuracy",
tuneGrid = tuneGrid,
trControl = trControl,
importance = TRUE,
nodesize = 14,
ntree = 300,
# NOTE(review): num.threads is ranger's argument, not randomForest's —
# likely ignored here; verify.
num.threads = availableCores()-1
)
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
end.time <- Sys.time()
time.taken <- end.time - start.time
print("-- Time: -- ")
## [1] "-- Time: -- "
time.taken
## Time difference of 10.68638 mins
print("")
## [1] ""
print(rf_mtry)
## Random Forest
##
## 29337 samples
## 4 predictor
## 5 classes: '1', '2', '3', '4', '5'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 26404, 26403, 26402, 26403, 26403, 26404, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 1 0.9192489 0.8663930
## 2 0.9974435 0.9958825
## 3 0.9985684 0.9976949
## 4 0.9985002 0.9975854
## 5 0.9980911 0.9969263
## 6 0.9980911 0.9969265
## 7 0.9980911 0.9969263
## 8 0.9980571 0.9968714
## 9 0.9980230 0.9968167
## 10 0.9981593 0.9970361
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.
# Extract the mtry chosen by CV accuracy; reused by the tuning stages below.
best_mtry <- rf_mtry$bestTune$mtry
cat("Best mtry value:",best_mtry)
## Best mtry value: 3
cat("\nMax accuracy mtry:", max(rf_mtry$results$Accuracy))
##
## Max accuracy mtry: 0.9985684
start.time <- Sys.time()
# Search maxnodes (cap on terminal nodes per tree) over 5..15 at the best
# mtry found above; one cross-validated fit per setting.
store_maxnode <- list()
tuneGrid <- expand.grid(.mtry = best_mtry)
for (maxnodes in c(5:15)) {
  set.seed(1234)
  rf_maxnode <- train(neighbourhood_group ~ .,
                      data = train,
                      method = "rf",
                      metric = "Accuracy",
                      tuneGrid = tuneGrid,
                      trControl = trControl,
                      importance = TRUE,
                      nodesize = 14,
                      maxnodes = maxnodes,
                      ntree = 300)  # trailing empty argument removed
  # Keyed by the maxnodes value so summary(resamples(...)) labels each row.
  store_maxnode[[toString(maxnodes)]] <- rf_maxnode
}
# Timing and resampling summary for the maxnodes 5..15 search.
end.time <- Sys.time()
time.taken <- end.time - start.time
print("-- Time: -- ")
## [1] "-- Time: -- "
time.taken
## Time difference of 7.612453 mins
print("")
## [1] ""
# Pool the per-model CV resamples so accuracy/kappa can be compared side by side.
results_mtry <- resamples(store_maxnode)
summary(results_mtry)
##
## Call:
## summary.resamples(object = results_mtry)
##
## Models: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 5 0.8649847 0.8693425 0.8774922 0.8759245 0.8823079 0.8837368 0
## 6 0.8653256 0.8696834 0.8786850 0.8765380 0.8822626 0.8847596 0
## 7 0.8660075 0.8702284 0.8797072 0.8769128 0.8827139 0.8844187 0
## 8 0.8677122 0.8721134 0.8810301 0.8827754 0.8864081 0.9048756 0
## 9 0.8700989 0.8944766 0.9006476 0.8986593 0.9052407 0.9155026 0
## 10 0.9072938 0.9092459 0.9139543 0.9167942 0.9213263 0.9321976 0
## 11 0.9004432 0.9099121 0.9205728 0.9185315 0.9255470 0.9328333 0
## 12 0.9093079 0.9178385 0.9246888 0.9233723 0.9309640 0.9321976 0
## 13 0.9233129 0.9307024 0.9340605 0.9338720 0.9383150 0.9444255 0
## 14 0.9287419 0.9352253 0.9398434 0.9398368 0.9434270 0.9532901 0
## 15 0.9386294 0.9430811 0.9475109 0.9491422 0.9545801 0.9628367 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 5 0.7793972 0.7864172 0.7996329 0.7971574 0.8076095 0.8098218 0
## 6 0.7799906 0.7868578 0.8016329 0.7980972 0.8071669 0.8117301 0
## 7 0.7811803 0.7881542 0.8033289 0.7989014 0.8083707 0.8113753 0
## 8 0.7838301 0.7910356 0.8055847 0.8085243 0.8146300 0.8450767 0
## 9 0.7876248 0.8273720 0.8376029 0.8343309 0.8453941 0.8618689 0
## 10 0.8483833 0.8515134 0.8592176 0.8639096 0.8713883 0.8890951 0
## 11 0.8369987 0.8526091 0.8701376 0.8667766 0.8782680 0.8904073 0
## 12 0.8516275 0.8656820 0.8769131 0.8747760 0.8872923 0.8891170 0
## 13 0.8746033 0.8868985 0.8923119 0.8920415 0.8994440 0.9094351 0
## 14 0.8836125 0.8943161 0.9016622 0.9017820 0.9077138 0.9238400 0
## 15 0.8998846 0.9071320 0.9141669 0.9169273 0.9257702 0.9392863 0
# Extend the maxnodes search upward (accuracy was still rising at 15).
# Results accumulate into the same store_maxnode list as the first pass.
start.time <- Sys.time()
# NOTE(review): the range starts at 15, which was already fitted in the
# previous loop — that setting is recomputed (and overwritten) here;
# 16:30 would avoid the duplicate work.
for (maxnodes in c(15 : 30)) {
set.seed(1234)
rf_maxnode <- train(neighbourhood_group~.,
data = train,
method = "rf",
metric = "Accuracy",
tuneGrid = tuneGrid,
trControl = trControl,
importance = TRUE,
nodesize = 14,
maxnodes = maxnodes,
ntree = 300)
current_iteration <- toString(maxnodes)
store_maxnode[[current_iteration]] <- rf_maxnode
}
end.time <- Sys.time()
time.taken <- end.time - start.time
print("-- Time: -- ")
## [1] "-- Time: -- "
time.taken
## Time difference of 14.05081 mins
print("")
## [1] ""
# Combined summary over all maxnodes settings 5..30.
results_mtry <- resamples(store_maxnode)
summary(results_mtry)
##
## Call:
## summary.resamples(object = results_mtry)
##
## Models: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 5 0.8649847 0.8693425 0.8774922 0.8759245 0.8823079 0.8837368 0
## 6 0.8653256 0.8696834 0.8786850 0.8765380 0.8822626 0.8847596 0
## 7 0.8660075 0.8702284 0.8797072 0.8769128 0.8827139 0.8844187 0
## 8 0.8677122 0.8721134 0.8810301 0.8827754 0.8864081 0.9048756 0
## 9 0.8700989 0.8944766 0.9006476 0.8986593 0.9052407 0.9155026 0
## 10 0.9072938 0.9092459 0.9139543 0.9167942 0.9213263 0.9321976 0
## 11 0.9004432 0.9099121 0.9205728 0.9185315 0.9255470 0.9328333 0
## 12 0.9093079 0.9178385 0.9246888 0.9233723 0.9309640 0.9321976 0
## 13 0.9233129 0.9307024 0.9340605 0.9338720 0.9383150 0.9444255 0
## 14 0.9287419 0.9352253 0.9398434 0.9398368 0.9434270 0.9532901 0
## 15 0.9386294 0.9430811 0.9475109 0.9491422 0.9545801 0.9628367 0
## 16 0.9458078 0.9495397 0.9529734 0.9542213 0.9597878 0.9635186 0
## 17 0.9515854 0.9556843 0.9587594 0.9593693 0.9642095 0.9682919 0
## 18 0.9519264 0.9562883 0.9613024 0.9607653 0.9656668 0.9696763 0
## 19 0.9498807 0.9575665 0.9631776 0.9628796 0.9673624 0.9741056 0
## 20 0.9567144 0.9601304 0.9708590 0.9679253 0.9734061 0.9778459 0
## 21 0.9706785 0.9719570 0.9759713 0.9751506 0.9770810 0.9809069 0
## 22 0.9723926 0.9750297 0.9768155 0.9770937 0.9791259 0.9819359 0
## 23 0.9734151 0.9760484 0.9769938 0.9776731 0.9804040 0.9819359 0
## 24 0.9737470 0.9762189 0.9780164 0.9780140 0.9799779 0.9819359 0
## 25 0.9751193 0.9779236 0.9786982 0.9795819 0.9827000 0.9839809 0
## 26 0.9737470 0.9786091 0.9798841 0.9796161 0.9822813 0.9843164 0
## 27 0.9758010 0.9796284 0.9812546 0.9811841 0.9837267 0.9849983 0
## 28 0.9771643 0.9788722 0.9809069 0.9810819 0.9834697 0.9853392 0
## 29 0.9774974 0.9800648 0.9809069 0.9817295 0.9841527 0.9863621 0
## 30 0.9781868 0.9799693 0.9819363 0.9821385 0.9846601 0.9856851 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 5 0.7793972 0.7864172 0.7996329 0.7971574 0.8076095 0.8098218 0
## 6 0.7799906 0.7868578 0.8016329 0.7980972 0.8071669 0.8117301 0
## 7 0.7811803 0.7881542 0.8033289 0.7989014 0.8083707 0.8113753 0
## 8 0.7838301 0.7910356 0.8055847 0.8085243 0.8146300 0.8450767 0
## 9 0.7876248 0.8273720 0.8376029 0.8343309 0.8453941 0.8618689 0
## 10 0.8483833 0.8515134 0.8592176 0.8639096 0.8713883 0.8890951 0
## 11 0.8369987 0.8526091 0.8701376 0.8667766 0.8782680 0.8904073 0
## 12 0.8516275 0.8656820 0.8769131 0.8747760 0.8872923 0.8891170 0
## 13 0.8746033 0.8868985 0.8923119 0.8920415 0.8994440 0.9094351 0
## 14 0.8836125 0.8943161 0.9016622 0.9017820 0.9077138 0.9238400 0
## 15 0.8998846 0.9071320 0.9141669 0.9169273 0.9257702 0.9392863 0
## 16 0.9117547 0.9178268 0.9235396 0.9255196 0.9345687 0.9407029 0
## 17 0.9212544 0.9279546 0.9330828 0.9339460 0.9417676 0.9485093 0
## 18 0.9218246 0.9290189 0.9370455 0.9362336 0.9441766 0.9506521 0
## 19 0.9185262 0.9311699 0.9401894 0.9397016 0.9469434 0.9579223 0
## 20 0.9296463 0.9351900 0.9526127 0.9479116 0.9568140 0.9640983 0
## 21 0.9523150 0.9544444 0.9609792 0.9596524 0.9628454 0.9689943 0
## 22 0.9551685 0.9594814 0.9623503 0.9628126 0.9661463 0.9707219 0
## 23 0.9567988 0.9610981 0.9626597 0.9637459 0.9682097 0.9706991 0
## 24 0.9573366 0.9613930 0.9642862 0.9642935 0.9675180 0.9707000 0
## 25 0.9595853 0.9641321 0.9653823 0.9668399 0.9719168 0.9740505 0
## 26 0.9573428 0.9652395 0.9673369 0.9669018 0.9712556 0.9745665 0
## 27 0.9607108 0.9668990 0.9695829 0.9694608 0.9736205 0.9756806 0
## 28 0.9628847 0.9656645 0.9690203 0.9692872 0.9732049 0.9762372 0
## 29 0.9634154 0.9676231 0.9690040 0.9703438 0.9743209 0.9779104 0
## 30 0.9645500 0.9674709 0.9706882 0.9710099 0.9751228 0.9768119 0
# Result: 30
# Now that you have the best values of mtry and maxnodes, you can tune the
# number of trees. The method is exactly the same as for maxnodes.
# Tune ntree (forest size) at the best mtry, one CV fit per candidate.
start.time <- Sys.time()
store_maxtrees <- list()
for (ntree in c(250, 300, 350, 400, 450, 500, 550, 600, 800, 1000, 2000)) {
set.seed(5678)
# NOTE(review): maxnodes is fixed at 24 here although 30 was the best value
# from the previous search (and is what the final model uses) — confirm
# this was intentional.
rf_maxtrees <- train(neighbourhood_group~.,
data = train,
method = "rf",
metric = "Accuracy",
tuneGrid = tuneGrid,
trControl = trControl,
importance = TRUE,
nodesize = 14,
maxnodes = 24,
ntree = ntree)
key <- toString(ntree)
store_maxtrees[[key]] <- rf_maxtrees
}
end.time <- Sys.time()
time.taken <- end.time - start.time
print("-- Time: -- ")
## [1] "-- Time: -- "
time.taken
## Time difference of 20.17586 mins
print("")
## [1] ""
# Accuracy is essentially flat across ntree — 250 trees already suffice.
results_tree <- resamples(store_maxtrees)
summary(results_tree)
##
## Call:
## summary.resamples(object = results_tree)
##
## Models: 250, 300, 350, 400, 450, 500, 550, 600, 800, 1000, 2000
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 250 0.9754518 0.9769126 0.9783535 0.9785596 0.9797954 0.9822768 0
## 300 0.9761337 0.9773386 0.9781831 0.9788663 0.9799660 0.9826176 0
## 350 0.9754518 0.9775827 0.9778497 0.9788663 0.9802284 0.9826176 0
## 400 0.9754518 0.9769128 0.9781905 0.9787640 0.9801364 0.9826176 0
## 450 0.9757927 0.9775942 0.9780201 0.9789003 0.9802284 0.9826176 0
## 500 0.9757927 0.9775128 0.9778422 0.9787299 0.9798022 0.9826176 0
## 550 0.9754518 0.9771604 0.9776794 0.9786617 0.9798022 0.9829584 0
## 600 0.9757927 0.9772456 0.9778498 0.9786958 0.9797954 0.9826176 0
## 800 0.9757927 0.9775942 0.9778497 0.9787980 0.9798022 0.9826176 0
## 1000 0.9757927 0.9775013 0.9776794 0.9787299 0.9797954 0.9826176 0
## 2000 0.9754518 0.9772534 0.9780201 0.9786958 0.9798022 0.9822768 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 250 0.9600825 0.9624691 0.9648381 0.9651800 0.9672193 0.9712526 0
## 300 0.9611893 0.9631733 0.9645575 0.9656789 0.9674895 0.9718120 0
## 350 0.9600832 0.9635485 0.9640089 0.9656738 0.9679314 0.9718105 0
## 400 0.9600667 0.9624696 0.9645621 0.9655031 0.9677586 0.9718105 0
## 450 0.9606278 0.9635856 0.9642846 0.9657281 0.9679314 0.9718100 0
## 500 0.9606278 0.9634492 0.9640085 0.9654539 0.9672477 0.9718100 0
## 550 0.9600667 0.9629186 0.9637187 0.9653449 0.9672477 0.9723673 0
## 600 0.9606278 0.9630503 0.9640081 0.9654002 0.9672160 0.9718100 0
## 800 0.9606278 0.9635816 0.9640128 0.9655633 0.9672457 0.9718105 0
## 1000 0.9606278 0.9634492 0.9637324 0.9654556 0.9672160 0.9718105 0
## 2000 0.9600825 0.9630424 0.9642846 0.9654014 0.9672509 0.9712526 0
# Final tuned model: best mtry, maxnodes = 30, ntree = 1000.
fit_rf <- train(neighbourhood_group~.,
train,
method = "rf",
metric = "Accuracy",
tuneGrid = tuneGrid,
trControl = trControl,
importance = TRUE,
nodesize = 14,
ntree = 1000,
maxnodes = 30)
# Compare the tuned model against the untuned baseline (rf) on the test set.
prediction <-predict(fit_rf, test)
prediction_default <-predict(rf, test)
conf = confusionMatrix(prediction, test$neighbourhood_group)
conf_default = confusionMatrix(prediction_default, test$neighbourhood_group)
# Tuned model: 98.3% accuracy — the depth caps actually cost accuracy
# relative to the unconstrained default (99.9%, shown further below).
conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4 5
## 1 7999 5 124 0 0
## 2 5 8697 171 0 22
## 3 4 0 1966 0 0
## 4 0 0 0 134 0
## 5 0 10 0 0 421
##
## Overall Statistics
##
## Accuracy : 0.9826
## 95% CI : (0.9806, 0.9844)
## No Information Rate : 0.4454
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9717
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.9989 0.9983 0.8695 1.000000 0.95034
## Specificity 0.9888 0.9817 0.9998 1.000000 0.99948
## Pos Pred Value 0.9841 0.9777 0.9980 1.000000 0.97680
## Neg Pred Value 0.9992 0.9986 0.9832 1.000000 0.99885
## Prevalence 0.4094 0.4454 0.1156 0.006851 0.02265
## Detection Rate 0.4090 0.4447 0.1005 0.006851 0.02153
## Detection Prevalence 0.4156 0.4548 0.1007 0.006851 0.02204
## Balanced Accuracy 0.9939 0.9900 0.9346 1.000000 0.97491
conf_default
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4 5
## 1 8005 2 7 0 0
## 2 1 8710 0 0 4
## 3 2 0 2253 0 0
## 4 0 0 0 134 0
## 5 0 0 1 0 439
##
## Overall Statistics
##
## Accuracy : 0.9991
## 95% CI : (0.9986, 0.9995)
## No Information Rate : 0.4454
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9986
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.9996 0.9998 0.9965 1.000000 0.99097
## Specificity 0.9992 0.9995 0.9999 1.000000 0.99995
## Pos Pred Value 0.9989 0.9994 0.9991 1.000000 0.99773
## Neg Pred Value 0.9997 0.9998 0.9995 1.000000 0.99979
## Prevalence 0.4094 0.4454 0.1156 0.006851 0.02265
## Detection Rate 0.4093 0.4453 0.1152 0.006851 0.02245
## Detection Prevalence 0.4098 0.4456 0.1153 0.006851 0.02250
## Balanced Accuracy 0.9994 0.9997 0.9982 1.000000 0.99546
#Step 6) Visualize Result
#varImpPlot(fit_rf)
library(ranger)
## Warning: package 'ranger' was built under R version 3.6.3
##
## Attaching package: 'ranger'
## The following object is masked from 'package:randomForest':
##
## importance
library(tuneRanger)
## Warning: package 'tuneRanger' was built under R version 3.6.3
## Loading required package: mlrMBO
## Warning: package 'mlrMBO' was built under R version 3.6.3
## Loading required package: mlr
## Warning: package 'mlr' was built under R version 3.6.3
## Loading required package: ParamHelpers
## Warning: package 'ParamHelpers' was built under R version 3.6.3
## 'mlr' is in maintenance mode since July 2019. Future development
## efforts will go into its successor 'mlr3' (<https://mlr3.mlr-org.com>).
##
## Attaching package: 'mlr'
## The following object is masked from 'package:e1071':
##
## impute
## The following object is masked from 'package:caret':
##
## train
## Loading required package: smoof
## Warning: package 'smoof' was built under R version 3.6.3
## Loading required package: checkmate
## Loading required package: lubridate
## Warning: package 'lubridate' was built under R version 3.6.3
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:dplyr':
##
## intersect, setdiff, union
## The following object is masked from 'package:cowplot':
##
## stamp
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
## Loading required package: lhs
## Warning: package 'lhs' was built under R version 3.6.3
# Same two models refit with the ranger package (fast randomForest
# reimplementation): a borough classifier and a price regression.
# T/F replaced with TRUE/FALSE (T and F are ordinary, reassignable
# variables in R and should not be relied on).
# NOTE(review): the object name 'ranger' shadows ranger::ranger(); it works
# because R looks up functions and data separately, but renaming the object
# would be clearer.
ranger <- ranger(neighbourhood_group ~ ., data = train,
                 write.forest = TRUE, classification = TRUE)
ranger_pred <- predict(ranger, data = test)
# Mosaic plot of actual borough (rows) vs predicted (columns).
tab <- table(test$neighbourhood_group, predictions(ranger_pred))
plot(tab)
rangerReg <- ranger(price ~ ., data = train,
                    write.forest = TRUE, classification = FALSE)
rangerReg
## Ranger result
##
## Call:
## ranger(price ~ ., data = train, write.forest = TRUE, classification = F)
##
## Type: Regression
## Number of trees: 500
## Sample size: 29337
## Number of independent variables: 4
## Mtry: 2
## Target node size: 5
## Variable importance mode: none
## Splitrule: variance
## OOB prediction error (MSE): 0.8986566
## R squared (OOB): 0.08119909
# Regression predictions for the test set.
rangerReg_pred = predict(rangerReg, data = test)
rangerReg_pred
## Ranger prediction
##
## Type: Regression
## Sample size: 19558
## Number of independent variables: 4
# NOTE(review): cross-tabulating two continuous vectors yields a huge,
# nearly all-zero table — built here but deliberately not printed (#tab).
# A scatter plot (below) is the meaningful comparison.
tab = table(test$price, predictions(rangerReg_pred))
#tab
library(plotly)
## Warning: package 'plotly' was built under R version 3.6.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggmap':
##
## wind
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
plot_ly(x = test$price, y = predictions(rangerReg_pred))
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
library(tuneRanger)
# https://github.com/PhilippPro/tuneRanger
# https://mlr.mlr-org.com/articles/tutorial/measures.html
# Wrap the training data in an mlr regression task and tune ranger's
# mtry / min.node.size / sample.fraction by model-based optimization
# (20 MBO iterations, 1000 trees, 8 threads). `=` -> `<-` for assignment.
task <- makeRegrTask(data = train, target = "price")
estimateTimeTuneRanger(task, iters = 20, num.threads = 8, num.trees = 1000)
res <- tuneRanger(task, measure = list(mse), num.trees = 1000,
                  num.threads = 8, iters = 20,
                  show.info = getOption("mlrMBO.show.info", TRUE))
## Computing y column(s) for design. Not provided.
## [mbo] 0: mtry=1; min.node.size=230; sample.fraction=0.777 : y = 0.885 : 3.0 secs : initdesign
## [mbo] 0: mtry=1; min.node.size=933; sample.fraction=0.449 : y = 0.886 : 2.3 secs : initdesign
## [mbo] 0: mtry=3; min.node.size=8; sample.fraction=0.749 : y = 0.979 : 27.0 secs : initdesign
## [mbo] 0: mtry=2; min.node.size=19; sample.fraction=0.549 : y = 0.889 : 8.8 secs : initdesign
## [mbo] 0: mtry=3; min.node.size=56; sample.fraction=0.654 : y = 0.911 : 17.3 secs : initdesign
## [mbo] 0: mtry=2; min.node.size=1.1e+03; sample.fraction=0.233 : y = 0.885 : 2.4 secs : initdesign
## [mbo] 0: mtry=4; min.node.size=129; sample.fraction=0.373 : y = 0.888 : 9.3 secs : initdesign
## [mbo] 0: mtry=2; min.node.size=2.18e+03; sample.fraction=0.509 : y = 0.885 : 2.4 secs : initdesign
## [mbo] 0: mtry=4; min.node.size=6; sample.fraction=0.809 : y = 1.11 : 36.1 secs : initdesign
## [mbo] 0: mtry=1; min.node.size=2; sample.fraction=0.463 : y = 0.885 : 2.6 secs : initdesign
## [mbo] 0: mtry=3; min.node.size=66; sample.fraction=0.223 : y = 0.878 : 5.0 secs : initdesign
## [mbo] 0: mtry=3; min.node.size=338; sample.fraction=0.359 : y = 0.884 : 4.0 secs : initdesign
## [mbo] 0: mtry=3; min.node.size=4; sample.fraction=0.334 : y = 0.91 : 11.6 secs : initdesign
## [mbo] 0: mtry=1; min.node.size=11; sample.fraction=0.485 : y = 0.884 : 2.9 secs : initdesign
## [mbo] 0: mtry=4; min.node.size=561; sample.fraction=0.7 : y = 0.902 : 7.9 secs : initdesign
## [mbo] 0: mtry=4; min.node.size=5.02e+03; sample.fraction=0.598 : y = 0.894 : 2.3 secs : initdesign
## [mbo] 0: mtry=2; min.node.size=282; sample.fraction=0.572 : y = 0.882 : 5.2 secs : initdesign
## [mbo] 0: mtry=2; min.node.size=35; sample.fraction=0.717 : y = 0.891 : 8.6 secs : initdesign
## [mbo] 0: mtry=3; min.node.size=32; sample.fraction=0.398 : y = 0.895 : 10.8 secs : initdesign
## [mbo] 0: mtry=1; min.node.size=3.01e+03; sample.fraction=0.674 : y = 0.89 : 2.0 secs : initdesign
## [mbo] 0: mtry=3; min.node.size=4; sample.fraction=0.635 : y = 0.971 : 26.8 secs : initdesign
## [mbo] 0: mtry=4; min.node.size=16; sample.fraction=0.59 : y = 0.949 : 24.4 secs : initdesign
## [mbo] 0: mtry=2; min.node.size=4e+03; sample.fraction=0.868 : y = 0.886 : 2.9 secs : initdesign
## [mbo] 0: mtry=1; min.node.size=177; sample.fraction=0.839 : y = 0.885 : 3.1 secs : initdesign
## [mbo] 0: mtry=2; min.node.size=6; sample.fraction=0.251 : y = 0.883 : 6.2 secs : initdesign
## [mbo] 0: mtry=4; min.node.size=1.5e+03; sample.fraction=0.303 : y = 0.89 : 2.7 secs : initdesign
## [mbo] 0: mtry=1; min.node.size=3; sample.fraction=0.799 : y = 0.885 : 3.3 secs : initdesign
## [mbo] 0: mtry=4; min.node.size=82; sample.fraction=0.898 : y = 0.999 : 20.8 secs : initdesign
## [mbo] 0: mtry=4; min.node.size=2; sample.fraction=0.275 : y = 0.916 : 14.1 secs : initdesign
## [mbo] 0: mtry=1; min.node.size=759; sample.fraction=0.415 : y = 0.886 : 2.4 secs : initdesign
## [mbo] 1: mtry=1; min.node.size=3; sample.fraction=0.201 : y = 0.885 : 3.0 secs : infill_cb
## [mbo] 2: mtry=4; min.node.size=44; sample.fraction=0.212 : y = 0.883 : 8.5 secs : infill_cb
## [mbo] 3: mtry=4; min.node.size=496; sample.fraction=0.514 : y = 0.895 : 6.4 secs : infill_cb
## [mbo] 4: mtry=2; min.node.size=866; sample.fraction=0.719 : y = 0.883 : 4.2 secs : infill_cb
## [mbo] 5: mtry=2; min.node.size=99; sample.fraction=0.293 : y = 0.879 : 4.6 secs : infill_cb
## [mbo] 6: mtry=1; min.node.size=15; sample.fraction=0.703 : y = 0.885 : 3.2 secs : infill_cb
## [mbo] 7: mtry=3; min.node.size=4.66e+03; sample.fraction=0.215 : y = 0.907 : 1.6 secs : infill_cb
## [mbo] 8: mtry=2; min.node.size=29; sample.fraction=0.201 : y = 0.88 : 4.7 secs : infill_cb
## [mbo] 9: mtry=3; min.node.size=194; sample.fraction=0.203 : y = 0.882 : 3.9 secs : infill_cb
## [mbo] 10: mtry=1; min.node.size=2; sample.fraction=0.9 : y = 0.885 : 3.1 secs : infill_cb
## [mbo] 11: mtry=1; min.node.size=16; sample.fraction=0.275 : y = 0.885 : 2.5 secs : infill_cb
## [mbo] 12: mtry=1; min.node.size=24; sample.fraction=0.889 : y = 0.885 : 3.2 secs : infill_cb
## [mbo] 13: mtry=1; min.node.size=5.34e+03; sample.fraction=0.304 : y = 0.921 : 1.9 secs : infill_cb
## [mbo] 14: mtry=1; min.node.size=2.33e+03; sample.fraction=0.898 : y = 0.888 : 1.9 secs : infill_cb
## [mbo] 15: mtry=2; min.node.size=525; sample.fraction=0.385 : y = 0.882 : 3.7 secs : infill_cb
## [mbo] 16: mtry=1; min.node.size=98; sample.fraction=0.564 : y = 0.885 : 2.6 secs : infill_cb
## [mbo] 17: mtry=3; min.node.size=24; sample.fraction=0.242 : y = 0.885 : 8.4 secs : infill_cb
## [mbo] 18: mtry=2; min.node.size=69; sample.fraction=0.225 : y = 0.879 : 5.0 secs : infill_cb
## [mbo] 19: mtry=1; min.node.size=2; sample.fraction=0.33 : y = 0.885 : 2.7 secs : infill_cb
## [mbo] 20: mtry=3; min.node.size=97; sample.fraction=0.276 : y = 0.88 : 7.4 secs : infill_cb
# The reported value is the mean of the best 5% of the tuning results.
res
## Recommended parameter settings:
## mtry min.node.size sample.fraction
## 1 2 78 0.2467632
## Results:
## mse exec.time
## 1 0.8787459 4.84
# Output from an earlier tuning run (note: differs from `res` above):
# Recommended parameter settings: mtry=2, min.node.size=55,
# sample.fraction=0.2136541; Results: mse=0.933998, exec.time=2.756667
# Model with the new tuned hyperparameters:
res$model
## Model for learner.id=regr.ranger; learner.class=regr.ranger
## Trained on: task.id = train; obs = 29337; features = 4
## Hyperparameters: num.threads=8,verbose=FALSE,respect.unordered.factors=order,mtry=2,min.node.size=78,sample.fraction=0.247,num.trees=1e+03,replace=FALSE
# Refit ranger with tuned hyperparameters. F -> FALSE, `=` spacing fixed.
# NOTE(review): mtry=2 / min.node.size=55 / sample.fraction=0.214 come from
# an earlier tuning run, not from `res` above (which recommended
# min.node.size=78, sample.fraction=0.247) — confirm which was intended.
tuned_rangerReg <- ranger(price ~ ., data = train, write.forest = TRUE,
                          classification = FALSE, mtry = 2,
                          min.node.size = 55, sample.fraction = 0.214,
                          num.trees = 1000, replace = FALSE)
tuned_rangerReg
## Ranger result
##
## Call:
## ranger(price ~ ., data = train, write.forest = TRUE, classification = F, mtry = 2, min.node.size = 55, sample.fraction = 0.214, num.trees = 1000, replace = FALSE)
##
## Type: Regression
## Number of trees: 1000
## Sample size: 29337
## Number of independent variables: 4
## Mtry: 2
## Target node size: 55
## Variable importance mode: none
## Splitrule: variance
## OOB prediction error (MSE): 0.8787042
## R squared (OOB): 0.1015987
# Score the held-out set with the tuned forest and print the summary.
tuned_rangerReg_pred <- predict(tuned_rangerReg, data = test)
tuned_rangerReg_pred
## Ranger prediction
##
## Type: Regression
## Sample size: 19558
## Number of independent variables: 4
# NOTE(review): `pred` is never defined in this script — presumably the
# tuned predictions, predictions(tuned_rangerReg_pred); confirm. Column
# index 14 also looks stale: the cleaned data has far fewer columns.
cm <- table(test[, 14], pred)
# Original fused `library(caret)` and the call onto one line (parse error).
library(caret)
confusionMatrix(as.factor(pred), as.factor(test$price))
# Forcing a common factor-level range so the two factors are comparable;
# a confusion matrix over continuous prices is of limited use regardless.
confusionMatrix(
  factor(pred, levels = 1:19558),
  factor(test$price, levels = 1:19558)
)
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.6.3
##
## Attaching package: 'ROCR'
## The following object is masked from 'package:mlr':
##
## performance
#ROCR
#roc_pred = predict(tuned_rangerReg,data = test, type="response")
#perf = prediction(roc_pred$predictions, test)
#==
#FIND OPTIMAL VALUE WITH MIN OUT OF BAG
#https://www.listendata.com/2014/11/random-forest-with-r.html
# Decision tree via caret. The knitted source fused three library() calls
# onto one line and used typographic quotes around "rpart" (parse errors).
library(rpart)
library(caret)
library(e1071)
dt <- train(neighbourhood_group ~ ., data = train, method = "rpart")
# caret's predict() takes newdata=, not data= (data= would be silently
# ignored and the training set rescored).
pred2 <- predict(dt, newdata = test)
# Compare against the actual borough column; table(pred2, test) would
# error on a whole data.frame.
table(pred2, test$neighbourhood_group)
# References for the neural-network section:
# https://medium.com/@brscntyz/neural-network-in-r-e275302b6e44
# https://datascienceplus.com/fitting-neural-network-in-r/
# https://www.kdnuggets.com/2016/08/begineers-guide-neural-networks-r.html/2
library(ISLR)
## Warning: package 'ISLR' was built under R version 3.6.3
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble 3.0.1 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## v purrr 0.3.4
## Warning: package 'tibble' was built under R version 3.6.3
## Warning: package 'stringr' was built under R version 3.6.3
## Warning: package 'forcats' was built under R version 3.6.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x purrr::accumulate() masks foreach::accumulate()
## x lubridate::as.difftime() masks base::as.difftime()
## x randomForest::combine() masks dplyr::combine()
## x lubridate::date() masks base::date()
## x plotly::filter() masks dplyr::filter(), stats::filter()
## x lubridate::intersect() masks base::intersect()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
## x randomForest::margin() masks ggplot2::margin()
## x lubridate::setdiff() masks base::setdiff()
## x lubridate::stamp() masks cowplot::stamp()
## x lubridate::union() masks base::union()
## x purrr::when() masks foreach::when()
library("keras")
## Warning: package 'keras' was built under R version 3.6.3
##
## Attaching package: 'keras'
## The following object is masked _by_ '.GlobalEnv':
##
## normalize
## The following object is masked from 'package:future':
##
## %<-%
library(neuralnet)
## Warning: package 'neuralnet' was built under R version 3.6.3
##
## Attaching package: 'neuralnet'
## The following object is masked from 'package:ROCR':
##
## prediction
## The following object is masked from 'package:dplyr':
##
## compute
library(Hmisc)
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:future':
##
## cluster
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:plotly':
##
## subplot
## The following object is masked from 'package:mlr':
##
## impute
## The following object is masked from 'package:e1071':
##
## impute
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
# Expand the factors (borough, room type) into dummy columns so the
# neural network can consume a purely numeric design matrix. The same
# formula is applied to the train and test splits.
design_formula <- ~ price + neighbourhood_group + room_type +
  longitude + latitude
m <- model.matrix(design_formula, data = train)
m_test <- model.matrix(design_formula, data = test)
head(m)
## (Intercept) price neighbourhood_group2 neighbourhood_group3
## 1 1 -0.01549291 0 0
## 3 1 -0.01132892 1 0
## 5 1 -0.30280835 1 0
## 6 1 0.19687067 1 0
## 8 1 -0.30697234 1 0
## 10 1 -0.01132892 1 0
## neighbourhood_group4 neighbourhood_group5 room_type2 room_type3 longitude
## 1 0 0 0 0 -0.4376476
## 3 0 0 0 0 0.2224944
## 5 0 0 1 0 0.1772139
## 6 0 0 1 0 -0.4946274
## 8 0 0 0 0 -0.7097639
## 10 0 0 1 0 -0.8276232
## latitude
## 1 -1.4938339
## 3 1.4683845
## 5 1.2756468
## 6 0.3433173
## 8 0.6591063
## 10 -0.2844097
#nn=neuralnet(price~ neighbourhood_group2+ neighbourhood_group3 +neighbourhood_group4+ neighbourhood_group5+ room_type2+ room_type3+longitude+latitude,data=m, hidden=10,act.fct = "logistic",
# linear.output = TRUE,stepmax=10^5,threshold = 0.01)
# You can also configure multiple hidden layers by passing a vector, e.g.:
#nn=neuralnet( price~ neighbourhood_group2+ neighbourhood_group3 +neighbourhood_group4+ neighbourhood_group5+ room_type2+ room_type3+longitude+latitude,data=m, hidden=c(7,6,5),act.fct = "logistic",
# linear.output = TRUE,stepmax=10^5,threshold = 0.01)
#hidden=c(7,6,5)
# Then comes prediction and error calculation; the error is computed with
# the root-mean-square method.
# Reconstructed from a garbled knit: `$` was escaped as `\( ... \)` and
# several statements were fused onto one line.
# NOTE(review): `nn` exists only in the commented-out neuralnet() calls
# above, so this chunk cannot run as written — confirm which fit to use.
nn_pred <- compute(nn, test[, 1:13])
nn_pred$net.result

# Root-mean-square error. The original computed
# sqrt(sum(actual^2 - predicted^2) / n), which is not RMSE: the residuals
# must be squared, not the operands separately.
RMSE <- function(actual, predicted) {
  sqrt(mean((actual - predicted)^2))
}

summary(nn_pred)
# The original did `nn_pred <- is.numeric(nn_pred)`, which replaces the
# predictions with a single logical — almost certainly meant to coerce the
# net output to a numeric vector instead.
nn_pred <- as.numeric(nn_pred$net.result)
RMSE(test$price, nn_pred)

# Actual vs. predicted price.
plot(test$price, nn_pred)

# Un-scale the network output back to the original price range.
pr.nn_ <- nn_pred * (max(dataset$price) - min(dataset$price)) +
  min(dataset$price)
# Reference: https://datascienceplus.com/fitting-neural-network-in-r/
# Baseline: linear model on all predictors (gaussian glm, identity link —
# equivalent to lm(), but glm() is convenient for deviance reporting).
lm.fit <- glm(price ~ ., data = train)
summary(lm.fit)
##
## Call:
## glm(formula = price ~ ., data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.009 -0.254 -0.102 0.044 41.017
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.361387 0.012348 -29.267 < 2e-16 ***
## neighbourhood_group2 0.271929 0.019299 14.091 < 2e-16 ***
## neighbourhood_group3 0.112169 0.023815 4.710 2.49e-06 ***
## neighbourhood_group4 -0.463934 0.068425 -6.780 1.22e-11 ***
## neighbourhood_group5 0.189675 0.046677 4.064 4.85e-05 ***
## latitude -0.054288 0.009057 -5.994 2.07e-09 ***
## longitude -0.080103 0.008788 -9.115 < 2e-16 ***
## room_type2 0.440940 0.011498 38.348 < 2e-16 ***
## room_type3 -0.091403 0.036869 -2.479 0.0132 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.893665)
##
## Null deviance: 28693 on 29336 degrees of freedom
## Residual deviance: 26209 on 29328 degrees of freedom
## AIC: 79968
##
## Number of Fisher Scoring iterations: 2
# Evaluate the linear baseline on the held-out split.
pr.lm <- predict(lm.fit, test)
lm_residuals <- pr.lm - test$price
MSE.lm <- sum(lm_residuals^2) / nrow(test)

# Two panels side by side; the NN panel is currently commented out.
par(mfrow = c(1, 2))
#plot(test$price,nn_pred$net.result,col='red',main='Real vs predicted NN',pch=18,cex=0.7)
#abline(0,1,lwd=2)
#legend('bottomright',legend='NN',pch=18,col='red', bty='n')
plot(test$price, pr.lm, col = 'blue',
     main = 'Real vs predicted Linear Regression', pch = 18, cex = 0.7)
abline(0, 1, lwd = 2)
legend('bottomright', legend = 'LM', pch = 18, col = 'blue',
       bty = 'n', cex = .95)